1 module modular_db.sql_preprocessor;
2 
// This file contains a simplified version of SQLite's lexer, patched for our needs.
// The original can be viewed here:
// https://www.sqlite.org/src/artifact?ci=trunk&filename=src/tokenize.c
// Known bugs:
// * Tcl-style parameters may be parsed incorrectly: https://www.sqlite.org/lang_expr.html#varparam
8 
9 import std.algorithm.comparison: among;
10 import std.array: Appender;
11 import std.typecons: Tuple;
12 import std.utf: byCodeUnit;
13 
14 private pure @safe:
15 
// Code-unit (byte-wise) range over a `const(char)[ ]`; the input type of every lexer helper below.
alias _ByteString = typeof(byCodeUnit!(const(char)[ ])(""));
17 
/// Bit flags selecting which transformations `preprocessSql` performs.
public enum SqlPreprocessorOptions {
    /// No transformation flags set.
    none                 = 0,
    /// Wrap bare tokens that start with a lowercase ASCII letter in double quotes.
    /// Mutually exclusive with `quoteUppercaseIdents`.
    quoteLowercaseIdents = 1 << 0,
    /// Wrap bare tokens that start with an uppercase ASCII letter in double quotes.
    /// Mutually exclusive with `quoteLowercaseIdents`.
    quoteUppercaseIdents = 1 << 1,
    /// Drop the whitespace that follows a line break (i.e. indentation).
    dedent               = 1 << 2,
    /// Remove comments, except those that contain a '%' or are left unterminated.
    stripComments        = 1 << 3,
}
25 
// True only for the line feed character; '\r' is classified as a space by `_isSpace`.
bool _isLineBreak(char c) nothrow @nogc {
    return '\n' == c;
}
29 
// True for horizontal whitespace: space, tab, carriage return and form feed (not '\n').
bool _isSpace(char c) nothrow @nogc {
    switch (c) {
    case ' ', '\t', '\r', '\f':
        return true;

    default:
        return false;
    }
}
33 
// True for the ASCII decimal digits '0' through '9'.
bool _isDigit(char c) nothrow @nogc {
    return '0' <= c && c <= '9';
}
37 
// True for the ASCII lowercase letters 'a' through 'z'.
bool _isLower(char c) nothrow @nogc {
    return 'a' <= c && c <= 'z';
}
41 
// True for the ASCII uppercase letters 'A' through 'Z'.
bool _isUpper(char c) nothrow @nogc {
    return 'A' <= c && c <= 'Z';
}
45 
// True for every character that can start an identifier-like token, except lowercase ASCII letters:
// uppercase ASCII letters, '_', the parameter sigils ':', '@', '$', '#', and any byte with the high
// bit set (part of a multi-byte UTF-8 sequence).
bool _isIdentStartNotLower(char c) nothrow @nogc {
    if (c >= 0x80)
        return true;
    if ('A' <= c && c <= 'Z')
        return true;
    switch (c) {
    case '_', ':', '@', '$', '#':
        return true;

    default:
        return false;
    }
}
49 
// True for every character that can start an identifier-like token, except uppercase ASCII letters:
// lowercase ASCII letters, '_', the parameter sigils ':', '@', '$', '#', and any byte with the high
// bit set (part of a multi-byte UTF-8 sequence).
bool _isIdentStartNotUpper(char c) nothrow @nogc {
    if (c >= 0x80)
        return true;
    if ('a' <= c && c <= 'z')
        return true;
    switch (c) {
    case '_', ':', '@', '$', '#':
        return true;

    default:
        return false;
    }
}
53 
// True for characters that may continue an identifier: ASCII letters and digits, '_', '$',
// and any byte with the high bit set.
bool _isIdent(char c) nothrow @nogc {
    if (c >= 0x80)
        return true;
    if (c == '_' || c == '$')
        return true;
    if ('0' <= c && c <= '9')
        return true;
    return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
}
57 
// True for the three quote characters that open a quoted token: ', " and `.
bool _isStringStart(char c) nothrow @nogc {
    switch (c) {
    case '\'', '"', '`':
        return true;

    default:
        return false;
    }
}
61 
// Consumes one qualifier from the front of `s` and returns it.
//
// Grammar: - | [1-9] \d* | (?!0)
//
// Returns "-" for a dash, the digit run for a positive number, and `null` (consuming nothing)
// when `s` is empty or starts with anything else.
// Throws `Exception` when the qualifier starts with '0'.
const(char)[ ] _parseQualifier(ref _ByteString s) {
    if (s.empty)
        return null;

    const head = s.front;
    if (head == '-') {
        s.popFront();
        return "-";
    }
    if (head == '0')
        throw new Exception("Can't qualify an identifier with 0");
    if (head < '1' || head > '9')
        return null;

    // Remember where the number starts, skip its digits, then slice out what was consumed.
    const whole = s.source;
    s._skipWhile!_isDigit();
    return whole[0 .. $ - s.length];
}
83 
// Parses the optional `schema.moduleId|` prefix found right after an opening square bracket.
//
// Grammar:
//     (?: (?&qualifier) \. )?
//     (?: (?&qualifier) \| | (?![\d-]) )
//
// Returns the two qualifiers; a `null` member means "not specified". `s` is advanced past
// everything that was recognized, including the '|' delimiter.
// Throws `Exception` when a qualifier is present but is not followed by '|'.
Tuple!(const(char)[ ], q{schema}, const(char)[ ], q{moduleId})
_parseQualifiers(ref _ByteString s) {
    import std.algorithm.searching: skipOver;
    import std.conv: text;
    import std.range.primitives: empty;

    alias Result = typeof(return);

    const first = _parseQualifier(s);
    if (!s.empty) {
        const next = s.front;
        if (next == '|') {
            // A single qualifier names the module; the schema stays unspecified.
            s.popFront();
            return Result(null, first);
        }
        if (next == '.') {
            s.popFront();
            const second = _parseQualifier(s);
            // The delimiter may only be omitted when there is no module qualifier.
            if (s.skipOver('|') || second.empty)
                return Result(first, second);
            throw new Exception(text("Invalid qualifier: expected '|' after '", first, '.', second, '\''));
        }
    }
    if (first.empty)
        return Result.init;
    throw new Exception(text("Invalid qualifier: expected '|' after '", first, '\''));
}
120 
// Unconditionally drops the first character of `_s`, then keeps dropping characters while `pred`
// accepts them. `_s` must not be empty on entry.
//
// Returns the last character handed to `pred`: the first rejected one, or, if the input ran out,
// the last accepted one. When no character was examined at all, `char.init` is returned.
char _skipWhile(alias pred)(ref _ByteString _s) nothrow @nogc {
    auto rest = _s;
    char last;
    rest.popFront();
    while (!rest.empty) {
        last = rest.front;
        if (!pred(last))
            break;
        rest.popFront();
    }
    _s = rest;
    return last;
}
130 
// Drops the current character and every identifier character that follows it.
alias _skipIdent = _skipWhile!_isIdent;
// Drops the current character and all whitespace (spaces and line breaks alike) that follows it.
alias _skipAnyWhitespace = _skipWhile!(ch => _isLineBreak(ch) || _isSpace(ch));
133 
// Copies the body of a bracketed identifier (everything up to the closing ']') into `app`,
// doubling every '"' so the text can be embedded in a double-quoted identifier.
//
// `s` must point just past the opening bracket (and past any qualifiers).
// Returns the input positioned right after the closing ']'.
// Throws `Exception` when the input ends or a line break is met before ']'.
_ByteString _copyBracketedIdent(_ByteString s, Appender!string app) {
    import std.exception: enforce;

    // Start of the text that has not been flushed to `app` yet.
    auto pending = s;
    while (true) {
        enforce(!s.empty, "Unclosed square bracket");
        const ch = s.front;
        s.popFront();
        if (ch == ']') {
            // Flush the tail, leaving out the bracket itself.
            app ~= pending.source[0 .. $ - s.length - 1];
            return s;
        }
        if (ch != '"') {
            enforce(!_isLineBreak(ch), "Unclosed square bracket");
            continue;
        }
        // Flush up to and including the quotation mark, then emit it once more.
        app ~= pending.source[0 .. $ - s.length];
        app ~= '"';
        pending = s;
    }
}
155 
// Skips a printf-style placeholder located at the front of `s`.
//
// `s` must start with '%' and contain at least two characters. "%%" is skipped as a unit;
// otherwise the format specification is parsed by `FormatSpec` and everything it consumed is
// skipped. If the specification is malformed, only the '%' itself is skipped.
//
// Returns the input positioned after the placeholder.
_ByteString _skipPlaceholder(_ByteString s) /+nothrow+/ {
    import std.conv: ConvException;
    import std.format: FormatException, FormatSpec;
    import std.range: NullSink, dropOne;

    if (s[1] == '%') // "%%"
        return s[2 .. $];
    auto fmt = FormatSpec!char(s.source);
    NullSink sink;
    try {
        const specFound = fmt.writeUpToNextSpec(sink);
        assert(specFound);
    } catch (FormatException)
        return s.dropOne(); // Skip '%'.
    catch (ConvException)
        // `FormatSpec` converts widths, precisions and positional indices with `std.conv`, which
        // reports out-of-range numbers (e.g. "%300$s") via `ConvException` rather than
        // `FormatException`. Treat that like any other malformed spec.
        return s.dropOne(); // Skip '%'.
    return s[$ - fmt.trailing.length .. $];
}
173 
// Skips a `--` comment. `_s` must start with the two-character comment opener.
//
// On return `_s` points at the terminating line break (which is not consumed), or is empty if
// the comment runs to the end of the input.
//
// Returns `true` only when `prepareToStrip` is set, the comment is terminated by a line break
// and it contains no '%'; i.e. when the caller may drop the comment from the output.
bool _skipSingleLineComment(bool prepareToStrip)(ref _ByteString _s) nothrow @nogc {
    auto rest = _s[2 .. $];
    static if (prepareToStrip)
        bool sawPercent = false;
    while (!rest.empty) {
        const ch = rest.front;
        if (_isLineBreak(ch)) {
            _s = rest;
            static if (prepareToStrip)
                return !sawPercent;
            else
                return false;
        }
        static if (prepareToStrip)
            if (ch == '%')
                sawPercent = true;
        rest.popFront();
    }
    _s = rest;
    return false; // Must not strip a comment on the last line.
}
193 
// Skips a `/* ... */` comment. `_s` must start with the two-character comment opener.
//
// On return `_s` points right after the closing `*/`, or is empty if the comment is unclosed.
//
// Returns `true` only when `prepareToStrip` is set, the comment is properly closed and it
// contains no '%'; i.e. when the caller may drop the comment from the output.
bool _skipMultiLineComment(bool prepareToStrip)(ref _ByteString _s) nothrow @nogc {
    auto rest = _s[2 .. $];
    static if (prepareToStrip)
        bool sawPercent = false;
    // Previously consumed character; the opener's own '*' deliberately does not count.
    char prev = '\0';
    while (!rest.empty) {
        const ch = rest.front;
        rest.popFront();
        if (ch == '/' && prev == '*') {
            _s = rest;
            static if (prepareToStrip)
                return !sawPercent;
            else
                return false;
        }
        prev = ch;
        static if (prepareToStrip)
            if (ch == '%')
                sawPercent = true;
    }
    _s = rest;
    return false; // Must not strip an unclosed comment.
}
217 
/// Rewrites `sql` (which is itself a printf-style format string) according to `options`.
///
/// Independently of `options`, every bracketed name is turned into a double-quoted identifier
/// preceded by positional format placeholders. With `firstAvailableArg == 1`:
///
/// $(UL
///   $(LI `[foo]` becomes `"%1$s"."%2$sfoo"` (default schema and default module id);)
///   $(LI `[-|foo]` becomes `"%1$s"."foo"` (module id suppressed);)
///   $(LI `[-.-|foo]` becomes `"foo"` (both suppressed);)
///   $(LI `[3.4|foo]` becomes `"%3$s"."%4$sfoo"` (explicit argument indices).)
/// )
///
/// String literals and comments are recognized and never rewritten (comments may be removed when
/// `stripComments` is set, unless they contain '%' or are unterminated).
///
/// Params:
///   options = Compile-time set of `SqlPreprocessorOptions` flags. `quoteLowercaseIdents` and
///             `quoteUppercaseIdents` cannot be combined.
///   sql = The text to process.
///   firstAvailableArg = Positional index used for the default schema placeholder; the default
///                       module id placeholder uses `firstAvailableArg + 1`.
///
/// Returns: The rewritten text together with two flags telling whether the default schema and
/// the default module id placeholders were emitted at least once. Empty input yields
/// `typeof(return).init`.
///
/// Throws: `Exception` on a malformed qualifier or an unclosed square bracket.
public Tuple!(string, q{sql}, bool, q{usesSchema}, bool, q{usesModuleId})
preprocessSql(SqlPreprocessorOptions options)(const(char)[ ] sql, size_t firstAvailableArg) {
    import std.array: appender;
    import std.conv: toChars;
    import std.range.primitives: empty;

    if (sql.empty)
        return typeof(return).init;
    auto app = appender!string();
    auto s = sql.byCodeUnit();
    auto schemaDefaultIndex = toChars(firstAvailableArg);
    auto moduleIdDefaultIndex = toChars(firstAvailableArg + 1);
    bool usesSchema;
    bool usesModuleId;
    // Invariant at the top of the main loop: `c == s.front`.
    char c = s.front;

    static if (options & SqlPreprocessorOptions.quoteLowercaseIdents) {
        static assert(!(options & SqlPreprocessorOptions.quoteUppercaseIdents),
            "Cannot set both `quoteLowercaseIdents` and `quoteUppercaseIdents`",
        );
        enum shouldQuote = true;
        alias isIdentStart = _isLower;
        alias isKeywordStart = _isIdentStartNotLower;
        enum charX = 'x';
    } else static if (options & SqlPreprocessorOptions.quoteUppercaseIdents) {
        enum shouldQuote = true;
        alias isIdentStart = _isUpper;
        alias isKeywordStart = _isIdentStartNotUpper;
        enum charX = 'X';
    } else
        enum shouldQuote = false;
    enum shouldDedent = !!(options & SqlPreprocessorOptions.dedent);
    enum shouldStripComments = !!(options & SqlPreprocessorOptions.stripComments);

    static if (shouldDedent)
        if (_isSpace(c)) {
            const crlf = c == '\r' && s.length >= 2 && s[1] == '\n';
            const nonSpace = s._skipAnyWhitespace();
            // Retain one space at the beginning of the string.
            if (s.empty)
                return typeof(return)(crlf ? "\r\n" : [immutable char(c)], false, false);
            if (crlf)
                app ~= "\r\n";
            else
                app ~= c;
            c = nonSpace;
        }
    // `lag` marks the start of the input that has not been copied into `app` yet;
    // `lag.source[0 .. $ - s.length]` is the text between `lag` and the current position.
    auto lag = s;
mainLoop:
    while (true) {
        assert(!s.empty, "Stepped inside the main parsing loop with an empty string");
        static if (shouldQuote) {
            // Keyword or named parameter.
            if (isKeywordStart(c)) {
                c = s._skipIdent();
                if (s.empty)
                    break mainLoop;
                continue mainLoop;
            }
            // Identifier.
            if (isIdentStart(c)) {
                if (c == charX && s.length >= 2 && (c = s[1]) == '\'') {
                    // Wait, it's a blob string.
                    s.popFront();
                    goto someString;
                }
                // Flush what precedes the identifier, then emit the identifier wrapped in quotes.
                app ~= lag.source[0 .. $ - s.length];
                app ~= '"';
                lag = s;
                c = s._skipIdent();
                app ~= lag.source[0 .. $ - s.length];
                app ~= '"';
                lag = s;
                if (s.empty)
                    break mainLoop;
                continue mainLoop;
            }
        }
        // Line break.
        static if (shouldDedent)
            if (_isLineBreak(c)) {
            lineBreak:
                s.popFront();
                if (s.empty)
                    break mainLoop;
                c = s.front;
                if (_isSpace(c) || _isLineBreak(c)) {
                    // The following line is indented.
                    app ~= lag.source[0 .. $ - s.length];
                    c = s._skipAnyWhitespace();
                    lag = s;
                    if (s.empty)
                        break mainLoop;
                }
                continue mainLoop;
            }
        // Qualified name.
        if (c == '[') {
            app ~= lag.source[0 .. $ - s.length];
            s.popFront();

            const q = _parseQualifiers(s);
            // A "-" qualifier suppresses the corresponding part; an empty one selects the default
            // positional argument.
            if (q.schema != "-") {
                app ~= `"%`;
                if (q.schema.empty) {
                    app ~= schemaDefaultIndex;
                    usesSchema = true;
                } else
                    app ~= q.schema;
                app ~= `$s".`;
            }
            if (q.moduleId != "-") {
                app ~= `"%`;
                if (q.moduleId.empty) {
                    app ~= moduleIdDefaultIndex;
                    usesModuleId = true;
                } else
                    app ~= q.moduleId;
                app ~= `$s`;
            } else
                app ~= '"';

            lag = s = _copyBracketedIdent(s, app);
            app ~= '"';
            if (s.empty)
                break mainLoop;
            c = s.front;
            continue mainLoop;
        }
        // Some kind of strings.
        if (_isStringStart(c)) {
        someString:
            const delim = c;
            while (true) {
                s.popFront();
                if (s.empty)
                    break mainLoop;
                c = s.front;
                if (c == delim) {
                    s.popFront();
                    if (s.empty)
                        break mainLoop;
                    c = s.front;
                    if (c != delim) // Escaped delimiter.
                        continue mainLoop;
                }
            }
        }
        static if (shouldQuote) {
            // Number.
            if (_isDigit(c)) {
                // Must parse `1.e2` as a single token.
                c = s._skipWhile!(c => _isIdent(c) || c == '.');
                if (s.empty)
                    break mainLoop;
                continue mainLoop;
            }
            // Printf placeholder (must not quote letters in it).
            if (c == '%' && s.length >= 2) {
                s = s._skipPlaceholder();
                if (s.empty)
                    break mainLoop;
                c = s.front;
                continue mainLoop;
            }
        }
        // Single-line comment.
        if (c == '-' && s.length >= 2 && s[1] == '-') {
            static if (shouldStripComments) {
                const commentStart = s.length;
                if (s._skipSingleLineComment!true()) {
                    app ~= lag.source[0 .. $ - commentStart];
                    lag = s;
                }
            } else
                s._skipSingleLineComment!false();
            // A comment on the last line has no terminating line break: the whole input has been
            // consumed, so there is nothing left to step over.
            if (s.empty)
                break mainLoop;
            static if (shouldDedent)
                goto lineBreak; // Careful: we have a wrong value of `c` at the moment.
            else {
                c = '\n';
                continue mainLoop;
            }
        }
        // Multi-line comment.
        if (c == '/' && s.length >= 2 && s[1] == '*') {
            static if (shouldStripComments) {
                const commentStart = s.length;
                if (s._skipMultiLineComment!true()) {
                    app ~= lag.source[0 .. $ - commentStart];
                    app ~= ' '; // Comments can delimit tokens.
                    lag = s;
                }
            } else
                s._skipMultiLineComment!false();
            if (s.empty)
                break mainLoop;
            c = s.front;
            continue mainLoop;
        }
        // Some other character.
        s.popFront();
        if (s.empty)
            break mainLoop;
        c = s.front;
    }
    // Copy whatever has not been flushed yet.
    app ~= lag.source;
    return typeof(return)(app.data, usesSchema, usesModuleId);
}